Part C. APIs

Load all packages here:

suppressMessages(library("httr"))
suppressMessages(library("jsonlite"))
suppressMessages(library("tidyverse"))
suppressMessages(library("sjPlot"))
suppressMessages(library("plotly"))
suppressMessages(library("countrycode"))
  1. Next, the goal is to obtain country-level development indicators that may be related to linguistic fragmentation from the World Bank API. You can read the documentation and instructions of the API here.

Write your own function that will take an indicator code (e.g. SP.POP.TOTL) as input, query the API, parse the JSON output into R, and then return a clean data frame where each row is a country. Feel free to take a look at the following code for some clues on how to query the API. (12 points)

Note: If you are not able to figure this exercise out, you can use the WDI package in the next exercises in order to be able to continue with the assignment.

get_WDI <- function(indicator) {
  # Description: function that gets the indicators stated from the World Bank API
    # args:
      # indicator: a string of length n with the name of the indicator
    # returns: 
      # data_indicator: a data frame with the indicator by country/year
  
  tryCatch({
    # Calling the API
    url_indicator <- paste0("http://api.worldbank.org/v2/country/all/indicator/", indicator, "?format=json")
    
    # Loading the data from the API
    raw_data <- GET(url = url_indicator) %>% 
                content("text", encoding = "UTF-8") %>%
                # automatically 'flatten' nested data frames into a single non-nested data frame
                fromJSON(flatten = TRUE)

    # Total number of observations in list 1
    total_obs <- raw_data[[1]]$total
    
    # Overwriting url_indicator and raw_data to collect all obs
    url_indicator <- paste0("http://api.worldbank.org/v2/country/all/indicator/", indicator, "?format=json&per_page=", total_obs)
    
    # Loading again the data from the API
    raw_data <- GET(url = url_indicator) %>% 
                content("text", encoding = "UTF-8") %>%
                # automatically 'flatten' nested data frames into a single non-nested data frame
                fromJSON(flatten = TRUE)

# Transforming the data (in list 2) into a data frame
    data_indicator <- raw_data[[2]] %>%
                      data.frame() %>% 
                      mutate(country_code = countrycode(countryiso3code, 
                                                        origin = "iso3c", 
                                                        destination = "iso2c"))
    
    return(data_indicator)},
    error = function(e){"This is not a valid indicator"})
}

# This code was adapted from:
# https://stackoverflow.com/questions/59985218/world-bank-api-query
  1. Using the function you just created, download country-level data on GDP per capita and other indicators and metrics that could be relevant (see the Alesina et al paper for inspiration). Merge this new country-level dataset with the dataset part_b_fractionalization_output.csv that you created at the end of Part B. As before, you may need to fix some of the country names to ensure that all countries can be merged. (4 points)
# Reading the dataset generated from Part B
PartB_data <- read.csv('part_b_fractionalization_output.csv', stringsAsFactors = FALSE)

# Using the API query function to get WDI data, dates selected based on the availability of data

GDP_per_capita <- get_WDI("NY.GDP.PCAP.CD") %>% 
                  select(country_code, date, value) %>% 
                  mutate (date = as.numeric(date)) %>% 
                  filter(date >= 2015 & date <= 2020) %>% 
                  group_by(country_code) %>% 
                  summarise(mean_GDPpc = mean(value)) %>% 
                  ungroup()
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: , AFE, AFW, ARB, CEB, CHI, CSS, EAP, EAR, EAS, ECA, ECS, EMU, EUU, FCS, HPC, IBD, IBT, IDA, IDB, IDX, LAC, LCN, LDC, LMY, LTE, MEA, MIC, MNA, NAC, OED, OSS, PRE, PSS, PST, SAS, SSA, SSF, SST, TEA, TEC, TLA, TMN, TSA, TSS, WLD, XKX
Unemployment_rate <- get_WDI("SL.UEM.TOTL.ZS") %>% 
                     select(country_code, date, value) %>% 
                     mutate (date = as.numeric(date)) %>% 
                     filter(date >= 2015 & date <= 2020) %>% 
                     group_by(country_code) %>% 
                     summarise(mean_unemployment = mean(value)) %>% 
                     ungroup()
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: , AFE, AFW, ARB, CEB, CHI, CSS, EAP, EAR, EAS, ECA, ECS, EMU, EUU, FCS, HPC, IBD, IBT, IDA, IDB, IDX, LAC, LCN, LDC, LMY, LTE, MEA, MIC, MNA, NAC, OED, OSS, PRE, PSS, PST, SAS, SSA, SSF, SST, TEA, TEC, TLA, TMN, TSA, TSS, WLD, XKX
Gini_index <- get_WDI("SI.POV.GINI") %>% 
              select(country_code, date, value) %>% 
              mutate (date = as.numeric(date)) %>% 
              filter(date >= 2015 & date <= 2020) %>% 
              group_by(country_code) %>% 
              summarise(mean_gini = mean(value)) %>% 
              ungroup()
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: , AFE, AFW, ARB, CEB, CHI, CSS, EAP, EAR, EAS, ECA, ECS, EMU, EUU, FCS, HPC, IBD, IBT, IDA, IDB, IDX, LAC, LCN, LDC, LMY, LTE, MEA, MIC, MNA, NAC, OED, OSS, PRE, PSS, PST, SAS, SSA, SSF, SST, TEA, TEC, TLA, TMN, TSA, TSS, WLD, XKX
Life_Expectacy <- get_WDI("SP.DYN.LE00.IN") %>% 
                  select(country_code, date, value) %>% 
                  mutate (date = as.numeric(date)) %>% 
                  filter(date >= 2015 & date <= 2020) %>% 
                  group_by(country_code) %>% 
                  summarise(mean_Life_Expectacy = mean(value)) %>% 
                  ungroup()
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: , AFE, AFW, ARB, CEB, CHI, CSS, EAP, EAR, EAS, ECA, ECS, EMU, EUU, FCS, HPC, IBD, IBT, IDA, IDB, IDX, LAC, LCN, LDC, LMY, LTE, MEA, MIC, MNA, NAC, OED, OSS, PRE, PSS, PST, SAS, SSA, SSF, SST, TEA, TEC, TLA, TMN, TSA, TSS, WLD, XKX
Literacy <- get_WDI("SE.ADT.LITR.ZS") %>% 
            select(country_code, date, value) %>% 
            mutate (date = as.numeric(date)) %>% 
            filter(date >= 2015 & date <= 2020) %>% 
            group_by(country_code) %>% 
            summarise(Literacy_mean = mean(value)) %>% 
            ungroup()
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: , AFE, AFW, ARB, CEB, CHI, CSS, EAP, EAR, EAS, ECA, ECS, EMU, EUU, FCS, HPC, IBD, IBT, IDA, IDB, IDX, LAC, LCN, LDC, LMY, LTE, MEA, MIC, MNA, NAC, OED, OSS, PRE, PSS, PST, SAS, SSA, SSF, SST, TEA, TEC, TLA, TMN, TSA, TSS, WLD, XKX
Infant_Mortality <- get_WDI("SP.DYN.IMRT.IN") %>% 
                    select(country_code, date, value) %>% 
                    mutate (date = as.numeric(date)) %>% 
                    filter(date >= 2015 & date <= 2020) %>% 
                    group_by(country_code) %>% 
                    summarise(Infant_Mortality_mean = mean(value)) %>% 
                    ungroup()
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: , AFE, AFW, ARB, CEB, CHI, CSS, EAP, EAR, EAS, ECA, ECS, EMU, EUU, FCS, HPC, IBD, IBT, IDA, IDB, IDX, LAC, LCN, LDC, LMY, LTE, MEA, MIC, MNA, NAC, OED, OSS, PRE, PSS, PST, SAS, SSA, SSF, SST, TEA, TEC, TLA, TMN, TSA, TSS, WLD, XKX
Mobile_Sub <- get_WDI("IT.CEL.SETS.P2") %>% 
              select(country_code, date, value) %>% 
              mutate (date = as.numeric(date)) %>% 
              filter(date >= 2015 & date <= 2020) %>% 
              group_by(country_code) %>% 
              summarise(mean_Mobile_Sub = mean(value)) %>% 
              ungroup()
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: , AFE, AFW, ARB, CEB, CHI, CSS, EAP, EAR, EAS, ECA, ECS, EMU, EUU, FCS, HPC, IBD, IBT, IDA, IDB, IDX, LAC, LCN, LDC, LMY, LTE, MEA, MIC, MNA, NAC, OED, OSS, PRE, PSS, PST, SAS, SSA, SSF, SST, TEA, TEC, TLA, TMN, TSA, TSS, WLD, XKX
# Merging tables
all_data <- list(PartB_data, GDP_per_capita, Unemployment_rate, Gini_index, 
                 Life_Expectacy, Literacy, Mobile_Sub, Infant_Mortality)
merged_data <- all_data %>% reduce(left_join, by = "country_code")
head(merged_data)
##   X           country_name language_fractionalization_index_alesina_et_al.
## 1 1                Albania                                          0.0399
## 2 2                Andorra                                          0.6848
## 3 3                Austria                                          0.1522
## 4 4                Belarus                                          0.4666
## 5 5                Belgium                                          0.5409
## 6 6 Bosnia and Herzegovina                                          0.6751
##   tweets_collected country_code language_fractionalization_index_tweets
## 1               42           AL                               0.6734694
## 2               15           AD                               0.5955556
## 3              492           AT                               0.6692610
## 4              198           BY                               0.1627385
## 5             1022           BE                               0.7401167
## 6              133           BA                               0.8642659
##   mean_GDPpc mean_unemployment mean_gini mean_Life_Expectacy Literacy_mean
## 1   4770.653         13.888167        NA            78.37817            NA
## 2  38719.793                NA        NA                  NA            NA
## 3  47853.700          5.321667        NA            81.54268            NA
## 4   6090.935          5.169667      25.2            74.03496            NA
## 5  44426.181          6.710000        NA            81.38496            NA
## 6   5565.235         20.497500        NA            77.19983            NA
##   mean_Mobile_Sub Infant_Mortality_mean
## 1        106.0978              8.533333
## 2        106.1947              2.716667
## 3        127.8842              2.950000
## 4        122.1636              2.633333
## 5        103.6796              3.350000
## 6        104.7418              5.200000
rm(PartB_data, Unemployment_rate, Gini_index, Life_Expectacy, Infant_Mortality, Literacy, Mobile_Sub)

# We collected data on GDP per capita, unemployment rates, the Gini Index, life expectancy, literacy and mobile phone subscriptions. As many countries do not have data for most recent years, we decided to take the average value between 2015 and 2020. By using the average value of those 5 years we can maximize the amount of countries included in the dataset. 

For the remaining exercises use any summary figures, visualization, statistical analyses, etc. that you find helpful to answer the questions. More extensive, insightful, polished, and well described answers will receive higher marks. Also see the assessment criteria on the course website https://lse-my472.github.io/

  1. Using the language fractionalization index from the Alesina et al. paper and data downloaded from the World Bank API, can you roughly replicate some of the findings from the paper? For example, Tables 5 and 8 or other findings? (10 points)
# Calculating the correlations between variables
correlation <- cor(merged_data[c("language_fractionalization_index_alesina_et_al.", "mean_unemployment", "mean_Life_Expectacy", "mean_Mobile_Sub", "Infant_Mortality_mean", "mean_GDPpc")], use = "complete.obs", method = "pearson")

# Replicating Table 5: 
table5 <- tab_corr(correlation,
          title = "Replication of Table 5: Correlation between the linguistic fractionalization and development indicators.",
          triangle = "lower",
          string.diag = c(rep(1, times = ncol(correlation))), #correlation of the same variable equals 1 
          fade.ns = FALSE)

table5
Replication of Table 5: Correlation between the linguistic fractionalization and development indicators.
  language_fractionalization_index_alesina_et_al. mean_unemployment mean_Life_Expectacy mean_Mobile_Sub Infant_Mortality_mean mean_GDPpc
language_fractionalization_index_alesina_et_al. 1          
mean_unemployment 0.176 1        
mean_Life_Expectacy -0.339 -0.077 1      
mean_Mobile_Sub 0.041 -0.306 -0.017 1    
Infant_Mortality_mean 0.222 0.278 -0.642 -0.336 1  
mean_GDPpc -0.050 -0.375 0.737 0.140 -0.557 1
Computed correlation used pearson-method with listwise-deletion.
# In the replication of Table 5, the strongest correlation is between the mean life expectancy and Alesina et al.'s index with a value of -0.339. 

# Regression Analysis
regression1 = lm(mean_GDPpc ~ language_fractionalization_index_alesina_et_al. + 
                             mean_unemployment + mean_Life_Expectacy + 
                              Infant_Mortality_mean, data = merged_data) 
regression2 = lm(mean_GDPpc ~ language_fractionalization_index_alesina_et_al. + 
                             mean_unemployment + mean_Life_Expectacy  +
                            Infant_Mortality_mean + 
                            mean_Mobile_Sub, data = merged_data) 
summary(regression1)
## 
## Call:
## lm(formula = mean_GDPpc ~ language_fractionalization_index_alesina_et_al. + 
##     mean_unemployment + mean_Life_Expectacy + Infant_Mortality_mean, 
##     data = merged_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -23484  -8395  -2160   7322  43806 
## 
## Coefficients:
##                                                   Estimate Std. Error t value
## (Intercept)                                     -414350.93   74272.26  -5.579
## language_fractionalization_index_alesina_et_al.   36104.56   12356.61   2.922
## mean_unemployment                                 -1927.94     516.69  -3.731
## mean_Life_Expectacy                                5719.28     890.93   6.419
## Infant_Mortality_mean                               -36.42    1294.20  -0.028
##                                                 Pr(>|t|)    
## (Intercept)                                     3.04e-06 ***
## language_fractionalization_index_alesina_et_al. 0.006144 ** 
## mean_unemployment                               0.000694 ***
## mean_Life_Expectacy                             2.47e-07 ***
## Infant_Mortality_mean                           0.977712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14540 on 34 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.7169, Adjusted R-squared:  0.6836 
## F-statistic: 21.53 on 4 and 34 DF,  p-value: 6.343e-09
# Replicating of Table 8:
table8 <- tab_model(regression1, regression2, title ="Replication of Table 8. Language diversity and Development Indicators (Dependent Variable: 5-year Mean of GDP per capita")

table8
Replication of Table 8. Language diversity and Development Indicators (Dependent Variable: 5-year Mean of GDP per capita
  mean_GDPpc mean_GDPpc
Predictors Estimates CI p Estimates CI p
(Intercept) -414350.93 -565290.32 – -263411.54 <0.001 -431368.44 -610101.62 – -252635.27 <0.001
language
fractionalization index
alesina et al
36104.56 10992.91 – 61216.22 0.006 35718.27 10168.66 – 61267.88 0.008
mean unemployment -1927.94 -2977.98 – -877.90 0.001 -1884.98 -2974.94 – -795.02 0.001
mean Life Expectacy 5719.28 3908.68 – 7529.87 <0.001 5814.63 3907.20 – 7722.06 <0.001
Infant Mortality mean -36.42 -2666.56 – 2593.71 0.978 175.68 -2728.87 – 3080.23 0.903
mean Mobile Sub 69.06 -305.54 – 443.67 0.710
Observations 39 39
R2 / R2 adjusted 0.717 / 0.684 0.718 / 0.675
# In replicating table 8, we ran two step-wise linear regressions with the five-year average of GDP per capita. In contradiction with the Alesina et al.'s analysis - our analysis shows a significant positive relationship between language fictionalization and GDP per capita (p = 0.008). This outcomes does not align with the hypothesis of the paper. 
  1. Using the language fractionalization Twitter-based index which you built, can you find interesting correlations with indicators from the World Bank or is the Twitter data too noisy? Are correlations stronger when only countries are considered for which at least a certain amount of tweets were contained in the dataset? A starting point could be to repeat some outcome from 3. now with the Twitter-based index.

A word of caution when interpreting these results: We can form hypotheses based on such findings, but only from the fact that variables co-move/correlate even when controlling for some others variables in a regression, we cannot say whether they cause each other to move or not link (7 points)

# Correlating the language fractionalization Twitter-based index with WDI  (total of 42 countries)
correlation_twitter <- cor(merged_data[c("language_fractionalization_index_tweets", "mean_unemployment", "mean_Life_Expectacy", "mean_Mobile_Sub", "Infant_Mortality_mean", "mean_GDPpc")], use = "complete.obs", method = "pearson")

# Creating a table 
table2 <- tab_corr(correlation_twitter,
          title = "Correlation between the linguistic fractionalization and development indicators.",
          triangle = "lower",
          #correlation of the same variable equals 1 
          string.diag = c(rep(1, times = ncol(correlation))), 
          fade.ns = FALSE)

table2
Correlation between the linguistic fractionalization and development indicators.
  language_fractionalization_index_tweets mean_unemployment mean_Life_Expectacy mean_Mobile_Sub Infant_Mortality_mean mean_GDPpc
language_fractionalization_index_tweets 1          
mean_unemployment 0.019 1        
mean_Life_Expectacy 0.015 -0.077 1      
mean_Mobile_Sub 0.065 -0.306 -0.017 1    
Infant_Mortality_mean 0.022 0.278 -0.642 -0.336 1  
mean_GDPpc 0.130 -0.375 0.737 0.140 -0.557 1
Computed correlation used pearson-method with listwise-deletion.
# Correlating the language fractionalization Twitter-based index with WDI only using countries that have over 500 tweets (total of 17 countries)

# Add condition of min 500 tweets 
max_tweets <- merged_data %>%
              filter(tweets_collected > 500) 

correlation_tweets <- cor(max_tweets[c("language_fractionalization_index_tweets", "mean_unemployment", "mean_Life_Expectacy", "mean_Mobile_Sub", "Infant_Mortality_mean", "mean_GDPpc")], use = "complete.obs", method = "pearson")

# Creating a table 
table3 <- tab_corr(correlation_tweets,
          title = "Correlation between the linguistic fractionalization and development indicators.",
          triangle = "lower",
          string.diag = c(rep(1, times = ncol(correlation))), #correlation of the same variable equals 1 
          fade.ns = FALSE)

table3
Correlation between the linguistic fractionalization and development indicators.
  language_fractionalization_index_tweets mean_unemployment mean_Life_Expectacy mean_Mobile_Sub Infant_Mortality_mean mean_GDPpc
language_fractionalization_index_tweets 1          
mean_unemployment -0.196 1        
mean_Life_Expectacy 0.186 0.142 1      
mean_Mobile_Sub 0.067 -0.305 -0.414 1    
Infant_Mortality_mean -0.205 0.169 -0.697 -0.095 1  
mean_GDPpc 0.332 -0.263 0.724 -0.173 -0.535 1
Computed correlation used pearson-method with listwise-deletion.
# When correlating the language fractionalization Twitter-based index with WDI without any twitter conditions, we cannot identify any correlations between the WDIs and our index, as the highest correlation is for the mean GDP per capita with a value of 0.130. When adding the conditionality of a minimum of 500 tweets, the correlation values increase. The correlation between the mean GDP per capita and our language fractionalization Twitter-based index increased from 0.130 to 0.332. All other variables also increase in absolute value and two change the direction from positive to negative correlation (i.e.Infant Mortality and Unemployment). Thus, we can argue that looking at countries with at least 500 tweets gives a more accurate representation of the countries' language fractionalization. However, it is important to note that by adding this conditionality we also loose 15 observations/countries of our sample. 
  1. Moving away from language fractionalization, next explore the Athena API from the World Health Organizsation https://www.who.int/data/gho/info/athena-api as one further example of an API. Read into its documentation and decide on some data to query using the httr package. Then analyze cross-country differences that you are interested in through a series of visualizations and computations based on the WHO and World Bank data. For ideas on visualizing cross-country differences, e.g. have a look at the website https://ourworldindata.org/. Yet, all data to answer the question needs to be obtained from the WHO Athena API and World Bank API through code in this document. (20 points)
# Function to query the WHO API to ask for data
get_WHO <- function(indicator){
    # Description: function that gets the indicators stated from the WHO API
    # args:
      # indicator: a string of length n with the name of the indicator
    # returns: 
      # WHO_indicator: a data frame with the indicator
  tryCatch({
    # Calling the API
    WHO_API <- paste0("http://apps.who.int/gho/athena/api/GHO/", indicator, "?format=json") 
    WHO_data <- GET(url = WHO_API) %>% 
                content("text", encoding = "UTF-8") %>%
                fromJSON(flatten = TRUE)
    
    WHO_indicator <- WHO_data[[5]] %>% 
                     tibble() %>% 
                     # Unnesting the nested Json.
                     # Ref: https://medium.com/@Periscopic/cozy-collecting-part-2-5e717588e37b
                     unnest_wider(Dim) %>% 
                     unnest(category, code) %>% 
                     pivot_wider(names_from = category, values_from = code) %>% 
                     mutate(country_code = countrycode(COUNTRY, 
                                                       origin = "iso3c", 
                                                       destination = "iso2c"))
    
    return(WHO_indicator)},
    error = function(e){"Not a valid indicator"})
}

# Calling the WHO API to get obesity rate among adults in different regions and countries
Obesity_rate <- get_WHO("NCD_BMI_30C")
## Warning: unnest() has a new interface. See ?unnest for details.
## Try `df %>% unnest(c(category, code))`, with `mutate()` if needed
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: SDF
# Cleaning the data for merging and visualization (filtering out unnecessary information)
Obesity_rate <- Obesity_rate %>% 
                select(country_code, SEX, YEAR, REGION, value.numeric) %>%  
                filter(!is.na(country_code))

#Calculating the average obesity rate in each country from 2015 to 2020
Obesity_rate <- Obesity_rate %>% 
                mutate(YEAR = as.numeric(YEAR)) %>% 
                filter(YEAR >= 2015 & YEAR <= 2020) %>% 
                group_by(country_code, REGION) %>% 
                summarise(Average_obesity = round(mean(value.numeric), 2)) %>%
                ungroup()
## `summarise()` has grouped output by 'country_code'. You can override using the
## `.groups` argument.
# Merging the data sets of GDP per capita and the obesity rate
Obesity_GDPpc <- merge(GDP_per_capita, Obesity_rate, by = "country_code", all.x = TRUE) %>%
                 mutate(COUNTRY = countrycode(country_code, origin = "iso2c",
                                              destination = "country.name"),
                        mean_GDPpc = round(mean_GDPpc, 2)) %>% na.omit()

# Visualizing a scatter plot
relation_fig <- plot_ly(data = Obesity_GDPpc,
                        x = ~mean_GDPpc,
                        y = ~Average_obesity) %>% 
                # Plotting the points
                add_trace(type = "scatter", mode = "markers",    
                          # Adding styles for the region categorization            
                          color = ~REGION,
                          opacity = 0.7,
                          marker = list(size = 10),               
                          # Adding hover text info for an interactive interface
                          size = 1.5,
                          hoverinfo = "text",
                          text = ~paste0("Country: ", COUNTRY,
                                         "<br>Region: ", REGION,
                                         "<br>Average GDP Per Capita USD (2015-2020): ", mean_GDPpc,
                                         "<br>Average Adult Obesity Rate (2015-2020): ", Average_obesity, "%")) %>% 
                # Formatting the plot
                layout(xaxis = list(title = "Average GDP Per Capita USD (2015-2020)",
                                    type = "log",
                                    tickvals = c(500, 1000, 5000, 10000, 50000, 100000)),
                       yaxis = list(title = "Average Adult Obesity Rate (2015-2020)"),
                       title = "Relationship between GDP Per Capita and Adult Obesity Rate")


regression_line <- lm(Average_obesity ~ log(mean_GDPpc), Obesity_GDPpc)
summary(regression_line)
## 
## Call:
## lm(formula = Average_obesity ~ log(mean_GDPpc), data = Obesity_GDPpc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.565  -5.621  -0.850   4.176  38.642 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -19.1521     4.4206  -4.332 2.44e-05 ***
## log(mean_GDPpc)   4.4489     0.5042   8.824 9.25e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.447 on 181 degrees of freedom
## Multiple R-squared:  0.3008, Adjusted R-squared:  0.2969 
## F-statistic: 77.86 on 1 and 181 DF,  p-value: 9.249e-16
# Interpretation: 
# Given the results, we can see that GDP per capita shows a positive relation with obesity rate.
# The estimate parameter of log(GDP) is 4.4489, in other words, an increase of 1% in the GDP per capita increases in 4.4489*log(1.01) = 0.04 the obesity rate. Besides, this parameter is statistically significant at 95%. Nevertheless, the R square is 0.3, which means that the dependent variable explains little of the independent variable.

# Adding regression line
relation_fig <- relation_fig %>% 
                add_lines(data = Obesity_GDPpc,
                          x = ~mean_GDPpc,
                          y = ~fitted(regression_line),
                          line = list(width = 1, dash = "dot", color="blue"),
                          showlegend = FALSE,
                          hoverinfo = "none",
                          mode = "lines") %>% 
                 # Changing legend's position
                 layout(xaxis = list(showgrid = FALSE),
                        yaxis = list(showgrid = FALSE),
                        legend = list(y = 0.5,
                        font = list(size = 10))) 

relation_fig
# Creating a density plot to show the distribution of obesity rate by region
density_plot <- ggplot(Obesity_GDPpc) +
                geom_density(aes(x = Average_obesity, 
                                 fill = REGION),
                                  color = NA) + 
                scale_fill_manual(values = alpha(c("green", "orange", "blue", 
                                                   "purple", "red", "yellow"), 0.4)) +
                # Removing background color and grids
                theme(legend.position = "bottom",
                      panel.background = element_blank(),
                      panel.grid = element_blank(),
                      axis.line.y = element_blank(), 
                      axis.ticks.y = element_blank(), 
                      axis.text.y = element_blank(), 
                      axis.title.y = element_blank()) +
                # Editing the title
                labs (title = "Distribution of Average Obesity Rate (2015-2020) by Region") +
                theme(plot.title = element_text(size = 10, face = "bold", hjust = 0.5)) +
                xlab("Average Obesity Rate (2015-2020)") +
                scale_y_continuous(expand = expansion (mult = 0, add = c(0, 0)))
  
density_plot

# Annotations: "WHO Region Classification: African Region (AFR), Americas Region (AMR), South-East Asian Region (SEAR), European Region (EUR), Eastern Mediterranean Region (EMR), Western Pacific Region (WPR)

#Findings: 
# 1. there is a positive correlation between the GDP per capita and the obesity rate. As we see in the regression analysis the obesity rate increases as GDP increases possibly because GDP is positive correlated with welfare.  
# 2. In terms of region, fluent countries (e.g Europe and North America) generally showed a greater tendency of obesity, and vice versa (e.g. Africa). However, the west pacific region showed a great disparity. 
# 3. In general, the Asian part of the region (such as China, Japan, and Korea) had an average obesity rate much lower than the average, while the oceania had the obesity rate much higher than the average.
# 4. Most of countries had an obesity rate between 20% and 40%.